In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [3]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27321 entries, 0 to 27320
Data columns (total 80 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   UID                          27321 non-null  int64  
 1   BLOCKID                      0 non-null      float64
 2   SUMLEVEL                     27321 non-null  int64  
 3   COUNTYID                     27321 non-null  int64  
 4   STATEID                      27321 non-null  int64  
 5   state                        27321 non-null  object 
 6   state_ab                     27321 non-null  object 
 7   city                         27321 non-null  object 
 8   place                        27321 non-null  object 
 9   type                         27321 non-null  object 
 10  primary                      27321 non-null  object 
 11  zip_code                     27321 non-null  int64  
 12  area_code                    27321 non-null  int64  
 13  lat                          27321 non-null  float64
 14  lng                          27321 non-null  float64
 15  ALand                        27321 non-null  float64
 16  AWater                       27321 non-null  int64  
 17  pop                          27321 non-null  int64  
 18  male_pop                     27321 non-null  int64  
 19  female_pop                   27321 non-null  int64  
 20  rent_mean                    27007 non-null  float64
 21  rent_median                  27007 non-null  float64
 22  rent_stdev                   27007 non-null  float64
 23  rent_sample_weight           27007 non-null  float64
 24  rent_samples                 27007 non-null  float64
 25  rent_gt_10                   27007 non-null  float64
 26  rent_gt_15                   27007 non-null  float64
 27  rent_gt_20                   27007 non-null  float64
 28  rent_gt_25                   27007 non-null  float64
 29  rent_gt_30                   27007 non-null  float64
 30  rent_gt_35                   27007 non-null  float64
 31  rent_gt_40                   27007 non-null  float64
 32  rent_gt_50                   27007 non-null  float64
 33  universe_samples             27321 non-null  int64  
 34  used_samples                 27321 non-null  int64  
 35  hi_mean                      27053 non-null  float64
 36  hi_median                    27053 non-null  float64
 37  hi_stdev                     27053 non-null  float64
 38  hi_sample_weight             27053 non-null  float64
 39  hi_samples                   27053 non-null  float64
 40  family_mean                  27023 non-null  float64
 41  family_median                27023 non-null  float64
 42  family_stdev                 27023 non-null  float64
 43  family_sample_weight         27023 non-null  float64
 44  family_samples               27023 non-null  float64
 45  hc_mortgage_mean             26748 non-null  float64
 46  hc_mortgage_median           26748 non-null  float64
 47  hc_mortgage_stdev            26748 non-null  float64
 48  hc_mortgage_sample_weight    26748 non-null  float64
 49  hc_mortgage_samples          26748 non-null  float64
 50  hc_mean                      26721 non-null  float64
 51  hc_median                    26721 non-null  float64
 52  hc_stdev                     26721 non-null  float64
 53  hc_samples                   26721 non-null  float64
 54  hc_sample_weight             26721 non-null  float64
 55  home_equity_second_mortgage  26864 non-null  float64
 56  second_mortgage              26864 non-null  float64
 57  home_equity                  26864 non-null  float64
 58  debt                         26864 non-null  float64
 59  second_mortgage_cdf          26864 non-null  float64
 60  home_equity_cdf              26864 non-null  float64
 61  debt_cdf                     26864 non-null  float64
 62  hs_degree                    27131 non-null  float64
 63  hs_degree_male               27121 non-null  float64
 64  hs_degree_female             27098 non-null  float64
 65  male_age_mean                27132 non-null  float64
 66  male_age_median              27132 non-null  float64
 67  male_age_stdev               27132 non-null  float64
 68  male_age_sample_weight       27132 non-null  float64
 69  male_age_samples             27132 non-null  float64
 70  female_age_mean              27115 non-null  float64
 71  female_age_median            27115 non-null  float64
 72  female_age_stdev             27115 non-null  float64
 73  female_age_sample_weight     27115 non-null  float64
 74  female_age_samples           27115 non-null  float64
 75  pct_own                      27053 non-null  float64
 76  married                      27130 non-null  float64
 77  married_snp                  27130 non-null  float64
 78  separated                    27130 non-null  float64
 79  divorced                     27130 non-null  float64
dtypes: float64(62), int64(12), object(6)
memory usage: 16.7+ MB
In [4]:
train.head()
Out[4]:
UID BLOCKID SUMLEVEL COUNTYID STATEID state state_ab city place type ... female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced
0 267822 NaN 140 53 36 New York NY Hamilton Hamilton City ... 44.48629 45.33333 22.51276 685.33845 2618.0 0.79046 0.57851 0.01882 0.01240 0.08770
1 246444 NaN 140 141 18 Indiana IN South Bend Roseland City ... 36.48391 37.58333 23.43353 267.23367 1284.0 0.52483 0.34886 0.01426 0.01426 0.09030
2 245683 NaN 140 63 18 Indiana IN Danville Danville City ... 42.15810 42.83333 23.94119 707.01963 3238.0 0.85331 0.64745 0.02830 0.01607 0.10657
3 279653 NaN 140 127 72 Puerto Rico PR San Juan Guaynabo Urban ... 47.77526 50.58333 24.32015 362.20193 1559.0 0.65037 0.47257 0.02021 0.02021 0.10106
4 247218 NaN 140 161 20 Kansas KS Manhattan Manhattan City City ... 24.17693 21.58333 11.10484 1854.48652 3051.0 0.13046 0.12356 0.00000 0.00000 0.03109

5 rows × 80 columns

In [5]:
train.shape
Out[5]:
(27321, 80)
In [6]:
train.duplicated().sum()
Out[6]:
160
In [7]:
train.drop_duplicates(inplace=True)
In [8]:
train.shape
Out[8]:
(27161, 80)
In [9]:
train['UID'].nunique()
Out[9]:
27161

TOTAL ROWS ARE 27161 AND TOTAL UNIQUE ENTRIES IN UID COLUMN IS ALSO 27161. SO WE ARE GOING TO SET UID COLUMN AS INDEX.

In [10]:
train = train.set_index('UID')
In [11]:
print(train.shape)
train.head()
(27161, 79)
Out[11]:
BLOCKID SUMLEVEL COUNTYID STATEID state state_ab city place type primary ... female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced
UID
267822 NaN 140 53 36 New York NY Hamilton Hamilton City tract ... 44.48629 45.33333 22.51276 685.33845 2618.0 0.79046 0.57851 0.01882 0.01240 0.08770
246444 NaN 140 141 18 Indiana IN South Bend Roseland City tract ... 36.48391 37.58333 23.43353 267.23367 1284.0 0.52483 0.34886 0.01426 0.01426 0.09030
245683 NaN 140 63 18 Indiana IN Danville Danville City tract ... 42.15810 42.83333 23.94119 707.01963 3238.0 0.85331 0.64745 0.02830 0.01607 0.10657
279653 NaN 140 127 72 Puerto Rico PR San Juan Guaynabo Urban tract ... 47.77526 50.58333 24.32015 362.20193 1559.0 0.65037 0.47257 0.02021 0.02021 0.10106
247218 NaN 140 161 20 Kansas KS Manhattan Manhattan City City tract ... 24.17693 21.58333 11.10484 1854.48652 3051.0 0.13046 0.12356 0.00000 0.00000 0.03109

5 rows × 79 columns

In [12]:
train['BLOCKID'].nunique()
Out[12]:
0
In [13]:
train['primary'].unique()
Out[13]:
array(['tract'], dtype=object)
In [14]:
train['SUMLEVEL'].unique()
Out[14]:
array([140])
In [15]:
train = train.drop(['BLOCKID','SUMLEVEL','primary'],axis=1)

DROPPING 'BLOCKID' COLUMN AS IT IS HAVING ALL NULL VALUES AND 'SUMLEVEL' as well as 'primary' COLUMN AS IT IS HAVING ONLY 1 ENTRY.

In [16]:
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27161 entries, 267822 to 265371
Data columns (total 76 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   COUNTYID                     27161 non-null  int64  
 1   STATEID                      27161 non-null  int64  
 2   state                        27161 non-null  object 
 3   state_ab                     27161 non-null  object 
 4   city                         27161 non-null  object 
 5   place                        27161 non-null  object 
 6   type                         27161 non-null  object 
 7   zip_code                     27161 non-null  int64  
 8   area_code                    27161 non-null  int64  
 9   lat                          27161 non-null  float64
 10  lng                          27161 non-null  float64
 11  ALand                        27161 non-null  float64
 12  AWater                       27161 non-null  int64  
 13  pop                          27161 non-null  int64  
 14  male_pop                     27161 non-null  int64  
 15  female_pop                   27161 non-null  int64  
 16  rent_mean                    26919 non-null  float64
 17  rent_median                  26919 non-null  float64
 18  rent_stdev                   26919 non-null  float64
 19  rent_sample_weight           26919 non-null  float64
 20  rent_samples                 26919 non-null  float64
 21  rent_gt_10                   26919 non-null  float64
 22  rent_gt_15                   26919 non-null  float64
 23  rent_gt_20                   26919 non-null  float64
 24  rent_gt_25                   26919 non-null  float64
 25  rent_gt_30                   26919 non-null  float64
 26  rent_gt_35                   26919 non-null  float64
 27  rent_gt_40                   26919 non-null  float64
 28  rent_gt_50                   26919 non-null  float64
 29  universe_samples             27161 non-null  int64  
 30  used_samples                 27161 non-null  int64  
 31  hi_mean                      26954 non-null  float64
 32  hi_median                    26954 non-null  float64
 33  hi_stdev                     26954 non-null  float64
 34  hi_sample_weight             26954 non-null  float64
 35  hi_samples                   26954 non-null  float64
 36  family_mean                  26931 non-null  float64
 37  family_median                26931 non-null  float64
 38  family_stdev                 26931 non-null  float64
 39  family_sample_weight         26931 non-null  float64
 40  family_samples               26931 non-null  float64
 41  hc_mortgage_mean             26719 non-null  float64
 42  hc_mortgage_median           26719 non-null  float64
 43  hc_mortgage_stdev            26719 non-null  float64
 44  hc_mortgage_sample_weight    26719 non-null  float64
 45  hc_mortgage_samples          26719 non-null  float64
 46  hc_mean                      26683 non-null  float64
 47  hc_median                    26683 non-null  float64
 48  hc_stdev                     26683 non-null  float64
 49  hc_samples                   26683 non-null  float64
 50  hc_sample_weight             26683 non-null  float64
 51  home_equity_second_mortgage  26801 non-null  float64
 52  second_mortgage              26801 non-null  float64
 53  home_equity                  26801 non-null  float64
 54  debt                         26801 non-null  float64
 55  second_mortgage_cdf          26801 non-null  float64
 56  home_equity_cdf              26801 non-null  float64
 57  debt_cdf                     26801 non-null  float64
 58  hs_degree                    27016 non-null  float64
 59  hs_degree_male               27007 non-null  float64
 60  hs_degree_female             26990 non-null  float64
 61  male_age_mean                27013 non-null  float64
 62  male_age_median              27013 non-null  float64
 63  male_age_stdev               27013 non-null  float64
 64  male_age_sample_weight       27013 non-null  float64
 65  male_age_samples             27013 non-null  float64
 66  female_age_mean              27000 non-null  float64
 67  female_age_median            27000 non-null  float64
 68  female_age_stdev             27000 non-null  float64
 69  female_age_sample_weight     27000 non-null  float64
 70  female_age_samples           27000 non-null  float64
 71  pct_own                      26954 non-null  float64
 72  married                      27011 non-null  float64
 73  married_snp                  27011 non-null  float64
 74  separated                    27011 non-null  float64
 75  divorced                     27011 non-null  float64
dtypes: float64(61), int64(10), object(5)
memory usage: 16.0+ MB
In [17]:
print(train.isnull().sum(axis=0))
COUNTYID         0
STATEID          0
state            0
state_ab         0
city             0
              ... 
pct_own        207
married        150
married_snp    150
separated      150
divorced       150
Length: 76, dtype: int64
In [18]:
train_na = train[train.isna().any(axis=1)]
In [19]:
print(train_na.shape)
train_na.head()
(576, 76)
Out[19]:
COUNTYID STATEID state state_ab city place type zip_code area_code lat ... female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced
UID
223593 19 4 Arizona AZ Tucson Littletown CDP 85734 520 32.067721 ... 40.02370 40.83333 8.49563 30.01695 161.0 NaN 0.16308 0.16308 0.02634 0.20499
233040 101 8 Colorado CO Pueblo Pueblo City City 81001 719 38.308407 ... 20.00784 19.25000 4.30291 172.56153 309.0 0.00000 0.00000 0.00000 0.00000 0.00000
263292 13 34 New Jersey NJ Newark Silver Lake City 7107 973 40.773830 ... 35.47667 35.58333 20.62717 369.61740 1671.0 0.24002 0.37411 0.05579 0.02504 0.07654
267158 47 36 New York NY Brooklyn New York City City 11215 718 40.659126 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
292484 25 55 Wisconsin WI Madison Madison City City 53703 608 43.073403 ... 22.03226 21.08333 5.13435 1365.86300 1981.0 0.00000 0.00773 0.00000 0.00000 0.01160

5 rows × 76 columns

In [20]:
null_data_percent = (train_na.shape[0]/train.shape[0])*100
null_data_percent
Out[20]:
2.1206877508191893

WE HAVE CHECKED PERCENTAGE OF NULL DATA PRESENT IN OUR DATAFRAME WHICH COMES OUT TO BE 2.12%, WHICH IS VERY LOW. IF WE DROP THESE ROWS NOT MUCH INFORMATION IS LOST

In [21]:
train = train.dropna()
In [22]:
train.shape
Out[22]:
(26585, 76)
In [23]:
df = train[train['pct_own']>0.1]
df.shape
Out[23]:
(26215, 76)
In [24]:
df = df.sort_values(by='second_mortgage',ascending=False)
In [25]:
pd.set_option('display.max_columns', None)
df.head()
Out[25]:
COUNTYID STATEID state state_ab city place type zip_code area_code lat lng ALand AWater pop male_pop female_pop rent_mean rent_median rent_stdev rent_sample_weight rent_samples rent_gt_10 rent_gt_15 rent_gt_20 rent_gt_25 rent_gt_30 rent_gt_35 rent_gt_40 rent_gt_50 universe_samples used_samples hi_mean hi_median hi_stdev hi_sample_weight hi_samples family_mean family_median family_stdev family_sample_weight family_samples hc_mortgage_mean hc_mortgage_median hc_mortgage_stdev hc_mortgage_sample_weight hc_mortgage_samples hc_mean hc_median hc_stdev hc_samples hc_sample_weight home_equity_second_mortgage second_mortgage home_equity debt second_mortgage_cdf home_equity_cdf debt_cdf hs_degree hs_degree_male hs_degree_female male_age_mean male_age_median male_age_stdev male_age_sample_weight male_age_samples female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced
UID
289712 147 51 Virginia VA Farmville Farmville Town 23901 434 37.297357 -78.396452 413391.0 0 1733 609 1124 782.00000 781.0 22.95830 11.00000 11.0 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.00000 11 11 33088.92156 23236.0 19970.41249 16.33316 19.0 47067.92731 59954.0 24030.19608 5.33316 8.0 2249.50000 2249.0 182.57419 0.79359 4.0 749.50000 749.0 36.51484 4.0 1.97980 0.00000 0.50000 0.00000 0.50000 0.00067 1.00000 0.77776 1.00000 1.00000 1.00000 21.33803 19.25000 9.50021 364.20985 609.0 19.58762 19.16667 4.00258 673.39577 1124.0 0.62069 0.03612 0.01806 0.01806 0.00000
251185 27 25 Massachusetts MA Worcester Worcester City City 1610 508 42.254262 -71.800347 797165.0 0 2133 1139 994 942.32740 953.0 304.34109 333.88019 645.0 0.98906 0.97813 0.86250 0.81563 0.68438 0.53281 0.47500 0.39063 655 640 39036.18368 29037.0 42317.65457 599.87224 768.0 50471.95789 40476.0 45794.28515 314.09134 432.0 1596.15811 1690.0 465.71234 30.05003 96.0 589.73200 528.0 198.18324 17.0 10.43434 0.43363 0.43363 0.43363 0.84956 0.00100 0.00468 0.08684 0.71803 0.68883 0.75828 30.99146 30.75000 18.15286 255.90977 1139.0 30.60147 26.16667 19.21553 262.09529 994.0 0.20247 0.37844 0.11976 0.09341 0.10539
269323 81 36 New York NY Corona Harbor Hills City 11368 718 40.751809 -73.853582 169666.0 0 4181 2249 1932 1413.12357 1388.0 499.47343 205.65925 777.0 1.00000 0.92664 0.80952 0.69241 0.58301 0.44659 0.37967 0.30245 821 777 56434.63436 46106.0 47279.53535 674.74625 997.0 48558.91165 40462.0 35569.90113 630.41529 878.0 3037.81395 3320.0 888.70919 29.17150 138.0 751.81483 894.0 269.48263 38.0 23.35354 0.31818 0.31818 0.40341 0.78409 0.00241 0.00638 0.18540 0.58739 0.61499 0.55192 30.09851 29.58333 18.22005 483.12831 2249.0 29.80694 27.66667 18.45616 448.69061 1932.0 0.15618 0.44490 0.14555 0.02357 0.04066
251324 3 24 Maryland MD Glen Burnie Glen Burnie CDP 21061 410 39.127273 -76.635265 1110282.0 0 4866 1985 2881 1335.49818 1335.0 336.92824 352.62444 1502.0 1.00000 0.91545 0.77763 0.60186 0.49001 0.42011 0.38016 0.23435 1502 1502 59466.62302 50164.0 37351.26266 1293.31194 2068.0 64899.68626 50705.0 39727.56212 706.84520 1125.0 1622.29005 1520.0 511.53797 156.43774 496.0 452.77058 509.0 165.06276 70.0 49.29293 0.27739 0.30212 0.35689 0.87633 0.00289 0.01131 0.05915 0.86185 0.85294 0.86732 29.07276 27.41667 19.97922 475.95730 1985.0 32.53273 30.66667 19.61959 694.10357 2881.0 0.22380 0.58250 0.08321 0.00000 0.01778
235788 57 12 Florida FL Tampa Egypt Lake-leto City 33614 813 28.029063 -82.495395 2050906.0 234794 5468 2784 2684 914.10322 880.0 191.64962 1067.77502 1847.0 0.96619 0.92794 0.80820 0.58925 0.44235 0.37140 0.22783 0.17350 1965 1804 48495.17313 38340.0 41137.53473 1664.02791 2179.0 52332.06236 39980.0 41386.75431 755.11681 1010.0 1641.00508 1462.0 774.11061 81.16409 169.0 446.96166 404.0 86.60735 45.0 34.89899 0.28972 0.28972 0.38785 0.78972 0.00324 0.00770 0.17505 0.92809 0.93188 0.92375 31.39914 29.08333 16.25854 613.84520 2784.0 34.53924 28.58333 18.56943 814.45000 2684.0 0.11618 0.36953 0.12876 0.09957 0.07339
In [26]:
top_2500_second_mortgage_pctown_10 = df.head(2500)
top_2500_second_mortgage_pctown_10
Out[26]:
COUNTYID STATEID state state_ab city place type zip_code area_code lat lng ALand AWater pop male_pop female_pop rent_mean rent_median rent_stdev rent_sample_weight rent_samples rent_gt_10 rent_gt_15 rent_gt_20 rent_gt_25 rent_gt_30 rent_gt_35 rent_gt_40 rent_gt_50 universe_samples used_samples hi_mean hi_median hi_stdev hi_sample_weight hi_samples family_mean family_median family_stdev family_sample_weight family_samples hc_mortgage_mean hc_mortgage_median hc_mortgage_stdev hc_mortgage_sample_weight hc_mortgage_samples hc_mean hc_median hc_stdev hc_samples hc_sample_weight home_equity_second_mortgage second_mortgage home_equity debt second_mortgage_cdf home_equity_cdf debt_cdf hs_degree hs_degree_male hs_degree_female male_age_mean male_age_median male_age_stdev male_age_sample_weight male_age_samples female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced
UID
289712 147 51 Virginia VA Farmville Farmville Town 23901 434 37.297357 -78.396452 413391.0 0 1733 609 1124 782.00000 781.0 22.95830 11.00000 11.0 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 0.00000 11 11 33088.92156 23236.0 19970.41249 16.33316 19.0 47067.92731 59954.0 24030.19608 5.33316 8.0 2249.50000 2249.0 182.57419 0.79359 4.0 749.50000 749.0 36.51484 4.0 1.97980 0.00000 0.50000 0.00000 0.50000 0.00067 1.00000 0.77776 1.00000 1.00000 1.00000 21.33803 19.25000 9.50021 364.20985 609.0 19.58762 19.16667 4.00258 673.39577 1124.0 0.62069 0.03612 0.01806 0.01806 0.00000
251185 27 25 Massachusetts MA Worcester Worcester City City 1610 508 42.254262 -71.800347 797165.0 0 2133 1139 994 942.32740 953.0 304.34109 333.88019 645.0 0.98906 0.97813 0.86250 0.81563 0.68438 0.53281 0.47500 0.39063 655 640 39036.18368 29037.0 42317.65457 599.87224 768.0 50471.95789 40476.0 45794.28515 314.09134 432.0 1596.15811 1690.0 465.71234 30.05003 96.0 589.73200 528.0 198.18324 17.0 10.43434 0.43363 0.43363 0.43363 0.84956 0.00100 0.00468 0.08684 0.71803 0.68883 0.75828 30.99146 30.75000 18.15286 255.90977 1139.0 30.60147 26.16667 19.21553 262.09529 994.0 0.20247 0.37844 0.11976 0.09341 0.10539
269323 81 36 New York NY Corona Harbor Hills City 11368 718 40.751809 -73.853582 169666.0 0 4181 2249 1932 1413.12357 1388.0 499.47343 205.65925 777.0 1.00000 0.92664 0.80952 0.69241 0.58301 0.44659 0.37967 0.30245 821 777 56434.63436 46106.0 47279.53535 674.74625 997.0 48558.91165 40462.0 35569.90113 630.41529 878.0 3037.81395 3320.0 888.70919 29.17150 138.0 751.81483 894.0 269.48263 38.0 23.35354 0.31818 0.31818 0.40341 0.78409 0.00241 0.00638 0.18540 0.58739 0.61499 0.55192 30.09851 29.58333 18.22005 483.12831 2249.0 29.80694 27.66667 18.45616 448.69061 1932.0 0.15618 0.44490 0.14555 0.02357 0.04066
251324 3 24 Maryland MD Glen Burnie Glen Burnie CDP 21061 410 39.127273 -76.635265 1110282.0 0 4866 1985 2881 1335.49818 1335.0 336.92824 352.62444 1502.0 1.00000 0.91545 0.77763 0.60186 0.49001 0.42011 0.38016 0.23435 1502 1502 59466.62302 50164.0 37351.26266 1293.31194 2068.0 64899.68626 50705.0 39727.56212 706.84520 1125.0 1622.29005 1520.0 511.53797 156.43774 496.0 452.77058 509.0 165.06276 70.0 49.29293 0.27739 0.30212 0.35689 0.87633 0.00289 0.01131 0.05915 0.86185 0.85294 0.86732 29.07276 27.41667 19.97922 475.95730 1985.0 32.53273 30.66667 19.61959 694.10357 2881.0 0.22380 0.58250 0.08321 0.00000 0.01778
235788 57 12 Florida FL Tampa Egypt Lake-leto City 33614 813 28.029063 -82.495395 2050906.0 234794 5468 2784 2684 914.10322 880.0 191.64962 1067.77502 1847.0 0.96619 0.92794 0.80820 0.58925 0.44235 0.37140 0.22783 0.17350 1965 1804 48495.17313 38340.0 41137.53473 1664.02791 2179.0 52332.06236 39980.0 41386.75431 755.11681 1010.0 1641.00508 1462.0 774.11061 81.16409 169.0 446.96166 404.0 86.60735 45.0 34.89899 0.28972 0.28972 0.38785 0.78972 0.00324 0.00770 0.17505 0.92809 0.93188 0.92375 31.39914 29.08333 16.25854 613.84520 2784.0 34.53924 28.58333 18.56943 814.45000 2684.0 0.11618 0.36953 0.12876 0.09957 0.07339
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
230480 77 6 California CA Manteca Manteca City City 95336 209 37.732143 -121.242902 99716769.0 1333851 6158 3063 3095 1301.87928 1260.0 607.57824 182.65634 412.0 0.94802 0.85644 0.78218 0.58663 0.42574 0.31931 0.24505 0.21040 497 404 93433.24940 74648.0 69348.40202 769.38949 1715.0 93602.43101 76881.0 62539.58165 615.59620 1432.0 2503.93443 2342.0 880.19803 198.44275 866.0 518.47460 466.0 282.26273 352.0 247.45455 0.06814 0.06814 0.16831 0.71100 0.12884 0.18870 0.34214 0.80677 0.80107 0.81224 36.60759 37.75000 22.99172 761.94470 3063.0 37.42113 38.83333 22.82683 744.85694 3095.0 0.67116 0.62787 0.06491 0.01817 0.04890
236853 86 12 Florida FL Cutler Bay Cutler Bay City 33189 305 25.550391 -80.347791 5973763.0 303353 6082 2846 3236 1171.16929 1143.0 467.51572 309.85627 1014.0 0.98108 0.94622 0.91434 0.77689 0.67032 0.57570 0.42430 0.27390 1014 1004 64840.46787 50832.0 46815.09612 1091.80330 1836.0 68306.55021 52547.0 44894.90566 846.17039 1432.0 1953.37474 1868.0 682.94572 184.00089 695.0 705.60640 695.0 126.68476 127.0 62.85859 0.06813 0.06813 0.10341 0.84550 0.12885 0.42634 0.09151 0.89805 0.94937 0.85839 28.14934 27.66667 17.70953 688.37319 2846.0 32.61418 31.41667 21.11881 771.15796 3236.0 0.50519 0.45658 0.04215 0.02133 0.11326
287458 439 48 Texas TX Keller Keller City Town 76248 817 32.913822 -97.204310 5402229.0 39076 6055 2948 3107 3568.54429 3791.0 612.08579 1.47118 24.0 1.00000 1.00000 0.75000 0.75000 0.75000 0.75000 0.75000 0.75000 24 24 180041.16860 177847.0 81585.93842 287.28619 1771.0 171782.58200 177067.0 68036.94142 286.46390 1695.0 2931.45288 2910.0 892.58827 293.90251 1469.0 1112.85983 1079.0 393.10459 278.0 110.05742 0.06125 0.06812 0.13623 0.84087 0.12891 0.29125 0.09709 0.96534 0.96353 0.96696 37.08122 42.16667 22.02195 687.23252 2948.0 37.81384 41.83333 22.86724 675.21689 3107.0 0.97987 0.76007 0.02131 0.00000 0.01714
271050 17 39 Ohio OH Middletown Jacksonburg Village 45042 513 39.556756 -84.443252 57770725.0 320075 4624 2242 2382 1034.18493 918.0 429.00225 57.24333 100.0 1.00000 0.88172 0.79570 0.52688 0.52688 0.34409 0.34409 0.25806 110 93 81413.04207 72585.0 54780.34544 736.53852 1578.0 85271.17968 77338.0 48963.94309 571.79591 1331.0 1610.44944 1475.0 700.78367 409.37231 1071.0 610.69520 555.0 257.80158 397.0 243.46465 0.06812 0.06812 0.19482 0.72956 0.12887 0.12674 0.29882 0.90630 0.90186 0.91071 43.07611 46.66667 21.89177 567.31229 2242.0 41.82758 44.00000 22.63359 553.13538 2382.0 0.92888 0.64182 0.01346 0.01346 0.12164
262770 1 34 New Jersey NJ Mays Landing Mays Landing City 8330 609 39.432879 -74.686137 35788759.0 941328 10140 4931 5209 1388.74715 1268.0 568.73713 256.56181 1010.0 1.00000 0.98390 0.92958 0.91650 0.85815 0.71529 0.67103 0.56137 1022 994 70990.60337 52393.0 60284.93812 2085.06896 3577.0 77103.45672 61947.0 59211.64578 1252.33218 2414.0 1897.19217 1771.0 731.71424 547.55028 1873.0 737.06836 727.0 367.04123 682.0 380.27220 0.05245 0.06810 0.13973 0.73307 0.12897 0.27864 0.29106 0.85941 0.85090 0.86751 37.92045 36.41667 22.06624 1186.78396 4931.0 40.04732 40.33333 22.32610 1271.13174 5209.0 0.70642 0.50818 0.05187 0.00383 0.08201

2500 rows × 76 columns

First getting data which has pct >0.1 then sorting it in descending order by second mortgage and filtering out top 2500.

In [27]:
import plotly.express as px
import plotly.graph_objects as go
In [28]:
fig = go.Figure(data=go.Scattergeo(
    lat = top_2500_second_mortgage_pctown_10['lat'],
    lon = top_2500_second_mortgage_pctown_10['lng']),
    )
fig.update_layout(
    geo=dict(
        scope = 'north america',
        showland = True,
        landcolor = "rgb(212, 212, 212)",
        subunitcolor = "rgb(255, 255, 255)",
        countrycolor = "rgb(255, 255, 255)",
        showlakes = True,
        lakecolor = "rgb(255, 255, 255)",
        showsubunits = True,
        showcountries = True,
        resolution = 50,
        projection = dict(
            type = 'conic conformal',
            rotation_lon = -100
        ),
        lonaxis = dict(
            showgrid = True,
            gridwidth = 0.5,
            range= [ -140.0, -55.0 ],
            dtick = 5
        ),
        lataxis = dict (
            showgrid = True,
            gridwidth = 0.5,
            range= [ 20.0, 60.0 ],
            dtick = 5
        )
    ),
    title='Top 2,500 locations with second mortgage is the highest and percent ownership is above 10 percent')
fig.show()
In [29]:
train['bad_debt'] = train['second_mortgage'] + train['home_equity'] - train['home_equity_second_mortgage']
In [30]:
train.head()
Out[30]:
COUNTYID STATEID state state_ab city place type zip_code area_code lat lng ALand AWater pop male_pop female_pop rent_mean rent_median rent_stdev rent_sample_weight rent_samples rent_gt_10 rent_gt_15 rent_gt_20 rent_gt_25 rent_gt_30 rent_gt_35 rent_gt_40 rent_gt_50 universe_samples used_samples hi_mean hi_median hi_stdev hi_sample_weight hi_samples family_mean family_median family_stdev family_sample_weight family_samples hc_mortgage_mean hc_mortgage_median hc_mortgage_stdev hc_mortgage_sample_weight hc_mortgage_samples hc_mean hc_median hc_stdev hc_samples hc_sample_weight home_equity_second_mortgage second_mortgage home_equity debt second_mortgage_cdf home_equity_cdf debt_cdf hs_degree hs_degree_male hs_degree_female male_age_mean male_age_median male_age_stdev male_age_sample_weight male_age_samples female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced bad_debt
UID
267822 53 36 New York NY Hamilton Hamilton City 13346 315 42.840812 -75.501524 202183361.0 1699120 5230 2612 2618 769.38638 784.0 232.63967 272.34441 362.0 0.86761 0.79155 0.59155 0.45634 0.42817 0.18592 0.15493 0.12958 387 355 63125.28406 48120.0 49042.01206 1290.96240 2024.0 67994.14790 53245.0 47667.30119 884.33516 1491.0 1414.80295 1223.0 641.22898 377.83135 867.0 570.01530 558.0 270.11299 770.0 499.29293 0.01588 0.02077 0.08919 0.52963 0.43658 0.49087 0.73341 0.89288 0.85880 0.92434 42.48574 44.00000 22.97306 696.42136 2612.0 44.48629 45.33333 22.51276 685.33845 2618.0 0.79046 0.57851 0.01882 0.01240 0.08770 0.09408
246444 141 18 Indiana IN South Bend Roseland City 46616 574 41.701441 -86.266614 1560828.0 100363 2633 1349 1284 804.87924 848.0 253.46747 312.58622 513.0 0.97410 0.93227 0.69920 0.69920 0.55179 0.41235 0.39044 0.27888 542 502 41931.92593 35186.0 31639.50203 838.74664 1127.0 50670.10337 43023.0 34715.57548 375.28798 554.0 864.41390 784.0 482.27020 316.88320 356.0 351.98293 336.0 125.40457 229.0 189.60606 0.02222 0.02222 0.04274 0.60855 0.42174 0.70823 0.58120 0.90487 0.86947 0.94187 34.84728 32.00000 20.37452 323.90204 1349.0 36.48391 37.58333 23.43353 267.23367 1284.0 0.52483 0.34886 0.01426 0.01426 0.09030 0.04274
245683 63 18 Indiana IN Danville Danville City 46122 317 39.792202 -86.515246 69561595.0 284193 6881 3643 3238 742.77365 703.0 323.39011 291.85520 378.0 0.95238 0.88624 0.79630 0.66667 0.39153 0.39153 0.28307 0.15873 459 378 84942.68317 74964.0 56811.62186 1155.20980 2488.0 95262.51431 85395.0 49292.67664 709.74925 1889.0 1506.06758 1361.0 731.89394 699.41354 1491.0 556.45986 532.0 184.42175 538.0 323.35354 0.00000 0.00000 0.09512 0.73484 1.00000 0.46332 0.28704 0.94288 0.94616 0.93952 39.38154 40.83333 22.89769 888.29730 3643.0 42.15810 42.83333 23.94119 707.01963 3238.0 0.85331 0.64745 0.02830 0.01607 0.10657 0.09512
279653 127 72 Puerto Rico PR San Juan Guaynabo Urban 927 787 18.396103 -66.104169 1105793.0 0 2700 1141 1559 803.42018 782.0 297.39258 259.30316 368.0 0.94693 0.87151 0.69832 0.61732 0.51397 0.46927 0.35754 0.32961 438 358 48733.67116 37845.0 45100.54010 928.32193 1267.0 56401.68133 44399.0 41082.90515 490.18479 729.0 1175.28642 1101.0 428.98751 261.28471 437.0 288.04047 247.0 185.55887 392.0 314.90566 0.01086 0.01086 0.01086 0.52714 0.53057 0.82530 0.73727 0.91500 0.90755 0.92043 48.64749 48.91667 23.05968 274.98956 1141.0 47.77526 50.58333 24.32015 362.20193 1559.0 0.65037 0.47257 0.02021 0.02021 0.10106 0.01086
247218 161 20 Kansas KS Manhattan Manhattan City City 66502 785 39.195573 -96.569366 2554403.0 0 5637 2586 3051 938.56493 881.0 392.44096 1005.42886 1704.0 0.99286 0.98247 0.91688 0.84740 0.78247 0.60974 0.55455 0.44416 1725 1540 31834.15466 22497.0 34046.50907 1548.67477 1983.0 54053.42396 50272.0 39609.12605 244.08903 395.0 1192.58759 1125.0 327.49674 76.61052 134.0 443.68855 444.0 76.12674 124.0 79.55556 0.05426 0.05426 0.05426 0.51938 0.18332 0.65545 0.74967 1.00000 1.00000 1.00000 26.07533 22.41667 11.84399 1296.89877 2586.0 24.17693 21.58333 11.10484 1854.48652 3051.0 0.13046 0.12356 0.00000 0.00000 0.03109 0.05426
In [31]:
train['bins_bad_debt'] = pd.cut(train['bad_debt'],bins=[0,0.1,.5,1], labels=["less than 10%","10-50%","50-100%"])
train.groupby(['bins_bad_debt']).size().plot(kind='pie',subplots=True,startangle=90, autopct='%1.1f%%')
plt.title('Bad Debt pct')
plt.ylabel("")

plt.show()
In [32]:
train['bins_debt'] = pd.cut(train['debt'],bins=[0,0.1,.5,1], labels=["less than 10%","10-50%","50-100%"])
train.groupby(['bins_debt']).size().plot(kind='pie',subplots=True,startangle=90, autopct='%1.1f%%')
plt.title('Debt pct')
plt.ylabel("")

plt.show()
In [33]:
train.head()
Out[33]:
COUNTYID STATEID state state_ab city place type zip_code area_code lat lng ALand AWater pop male_pop female_pop rent_mean rent_median rent_stdev rent_sample_weight rent_samples rent_gt_10 rent_gt_15 rent_gt_20 rent_gt_25 rent_gt_30 rent_gt_35 rent_gt_40 rent_gt_50 universe_samples used_samples hi_mean hi_median hi_stdev hi_sample_weight hi_samples family_mean family_median family_stdev family_sample_weight family_samples hc_mortgage_mean hc_mortgage_median hc_mortgage_stdev hc_mortgage_sample_weight hc_mortgage_samples hc_mean hc_median hc_stdev hc_samples hc_sample_weight home_equity_second_mortgage second_mortgage home_equity debt second_mortgage_cdf home_equity_cdf debt_cdf hs_degree hs_degree_male hs_degree_female male_age_mean male_age_median male_age_stdev male_age_sample_weight male_age_samples female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced bad_debt bins_bad_debt bins_debt
UID
267822 53 36 New York NY Hamilton Hamilton City 13346 315 42.840812 -75.501524 202183361.0 1699120 5230 2612 2618 769.38638 784.0 232.63967 272.34441 362.0 0.86761 0.79155 0.59155 0.45634 0.42817 0.18592 0.15493 0.12958 387 355 63125.28406 48120.0 49042.01206 1290.96240 2024.0 67994.14790 53245.0 47667.30119 884.33516 1491.0 1414.80295 1223.0 641.22898 377.83135 867.0 570.01530 558.0 270.11299 770.0 499.29293 0.01588 0.02077 0.08919 0.52963 0.43658 0.49087 0.73341 0.89288 0.85880 0.92434 42.48574 44.00000 22.97306 696.42136 2612.0 44.48629 45.33333 22.51276 685.33845 2618.0 0.79046 0.57851 0.01882 0.01240 0.08770 0.09408 less than 10% 50-100%
246444 141 18 Indiana IN South Bend Roseland City 46616 574 41.701441 -86.266614 1560828.0 100363 2633 1349 1284 804.87924 848.0 253.46747 312.58622 513.0 0.97410 0.93227 0.69920 0.69920 0.55179 0.41235 0.39044 0.27888 542 502 41931.92593 35186.0 31639.50203 838.74664 1127.0 50670.10337 43023.0 34715.57548 375.28798 554.0 864.41390 784.0 482.27020 316.88320 356.0 351.98293 336.0 125.40457 229.0 189.60606 0.02222 0.02222 0.04274 0.60855 0.42174 0.70823 0.58120 0.90487 0.86947 0.94187 34.84728 32.00000 20.37452 323.90204 1349.0 36.48391 37.58333 23.43353 267.23367 1284.0 0.52483 0.34886 0.01426 0.01426 0.09030 0.04274 less than 10% 50-100%
245683 63 18 Indiana IN Danville Danville City 46122 317 39.792202 -86.515246 69561595.0 284193 6881 3643 3238 742.77365 703.0 323.39011 291.85520 378.0 0.95238 0.88624 0.79630 0.66667 0.39153 0.39153 0.28307 0.15873 459 378 84942.68317 74964.0 56811.62186 1155.20980 2488.0 95262.51431 85395.0 49292.67664 709.74925 1889.0 1506.06758 1361.0 731.89394 699.41354 1491.0 556.45986 532.0 184.42175 538.0 323.35354 0.00000 0.00000 0.09512 0.73484 1.00000 0.46332 0.28704 0.94288 0.94616 0.93952 39.38154 40.83333 22.89769 888.29730 3643.0 42.15810 42.83333 23.94119 707.01963 3238.0 0.85331 0.64745 0.02830 0.01607 0.10657 0.09512 less than 10% 50-100%
279653 127 72 Puerto Rico PR San Juan Guaynabo Urban 927 787 18.396103 -66.104169 1105793.0 0 2700 1141 1559 803.42018 782.0 297.39258 259.30316 368.0 0.94693 0.87151 0.69832 0.61732 0.51397 0.46927 0.35754 0.32961 438 358 48733.67116 37845.0 45100.54010 928.32193 1267.0 56401.68133 44399.0 41082.90515 490.18479 729.0 1175.28642 1101.0 428.98751 261.28471 437.0 288.04047 247.0 185.55887 392.0 314.90566 0.01086 0.01086 0.01086 0.52714 0.53057 0.82530 0.73727 0.91500 0.90755 0.92043 48.64749 48.91667 23.05968 274.98956 1141.0 47.77526 50.58333 24.32015 362.20193 1559.0 0.65037 0.47257 0.02021 0.02021 0.10106 0.01086 less than 10% 50-100%
247218 161 20 Kansas KS Manhattan Manhattan City City 66502 785 39.195573 -96.569366 2554403.0 0 5637 2586 3051 938.56493 881.0 392.44096 1005.42886 1704.0 0.99286 0.98247 0.91688 0.84740 0.78247 0.60974 0.55455 0.44416 1725 1540 31834.15466 22497.0 34046.50907 1548.67477 1983.0 54053.42396 50272.0 39609.12605 244.08903 395.0 1192.58759 1125.0 327.49674 76.61052 134.0 443.68855 444.0 76.12674 124.0 79.55556 0.05426 0.05426 0.05426 0.51938 0.18332 0.65545 0.74967 1.00000 1.00000 1.00000 26.07533 22.41667 11.84399 1296.89877 2586.0 24.17693 21.58333 11.10484 1854.48652 3051.0 0.13046 0.12356 0.00000 0.00000 0.03109 0.05426 less than 10% 50-100%
In [34]:
city_list = train['city'].value_counts()
city_list
Out[34]:
Chicago            286
Brooklyn           261
Los Angeles        219
Houston            213
Philadelphia       160
                  ... 
Fort Atkinson        1
Kirksville           1
Calera               1
Mc Connelsville      1
Richton Park         1
Name: city, Length: 6876, dtype: int64
In [35]:
city_list_10 = city_list[0:10]
city_list_10
Out[35]:
Chicago         286
Brooklyn        261
Los Angeles     219
Houston         213
Philadelphia    160
San Antonio     138
Baltimore       128
Las Vegas       123
Phoenix         114
Miami           105
Name: city, dtype: int64
In [36]:
city_list_10.index
Out[36]:
Index(['Chicago', 'Brooklyn', 'Los Angeles', 'Houston', 'Philadelphia',
       'San Antonio', 'Baltimore', 'Las Vegas', 'Phoenix', 'Miami'],
      dtype='object')
In [37]:
train['good_debt'] = 1 - train['bad_debt']
print(train.shape)
train.head()
(26585, 80)
Out[37]:
COUNTYID STATEID state state_ab city place type zip_code area_code lat lng ALand AWater pop male_pop female_pop rent_mean rent_median rent_stdev rent_sample_weight rent_samples rent_gt_10 rent_gt_15 rent_gt_20 rent_gt_25 rent_gt_30 rent_gt_35 rent_gt_40 rent_gt_50 universe_samples used_samples hi_mean hi_median hi_stdev hi_sample_weight hi_samples family_mean family_median family_stdev family_sample_weight family_samples hc_mortgage_mean hc_mortgage_median hc_mortgage_stdev hc_mortgage_sample_weight hc_mortgage_samples hc_mean hc_median hc_stdev hc_samples hc_sample_weight home_equity_second_mortgage second_mortgage home_equity debt second_mortgage_cdf home_equity_cdf debt_cdf hs_degree hs_degree_male hs_degree_female male_age_mean male_age_median male_age_stdev male_age_sample_weight male_age_samples female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced bad_debt bins_bad_debt bins_debt good_debt
UID
267822 53 36 New York NY Hamilton Hamilton City 13346 315 42.840812 -75.501524 202183361.0 1699120 5230 2612 2618 769.38638 784.0 232.63967 272.34441 362.0 0.86761 0.79155 0.59155 0.45634 0.42817 0.18592 0.15493 0.12958 387 355 63125.28406 48120.0 49042.01206 1290.96240 2024.0 67994.14790 53245.0 47667.30119 884.33516 1491.0 1414.80295 1223.0 641.22898 377.83135 867.0 570.01530 558.0 270.11299 770.0 499.29293 0.01588 0.02077 0.08919 0.52963 0.43658 0.49087 0.73341 0.89288 0.85880 0.92434 42.48574 44.00000 22.97306 696.42136 2612.0 44.48629 45.33333 22.51276 685.33845 2618.0 0.79046 0.57851 0.01882 0.01240 0.08770 0.09408 less than 10% 50-100% 0.90592
246444 141 18 Indiana IN South Bend Roseland City 46616 574 41.701441 -86.266614 1560828.0 100363 2633 1349 1284 804.87924 848.0 253.46747 312.58622 513.0 0.97410 0.93227 0.69920 0.69920 0.55179 0.41235 0.39044 0.27888 542 502 41931.92593 35186.0 31639.50203 838.74664 1127.0 50670.10337 43023.0 34715.57548 375.28798 554.0 864.41390 784.0 482.27020 316.88320 356.0 351.98293 336.0 125.40457 229.0 189.60606 0.02222 0.02222 0.04274 0.60855 0.42174 0.70823 0.58120 0.90487 0.86947 0.94187 34.84728 32.00000 20.37452 323.90204 1349.0 36.48391 37.58333 23.43353 267.23367 1284.0 0.52483 0.34886 0.01426 0.01426 0.09030 0.04274 less than 10% 50-100% 0.95726
245683 63 18 Indiana IN Danville Danville City 46122 317 39.792202 -86.515246 69561595.0 284193 6881 3643 3238 742.77365 703.0 323.39011 291.85520 378.0 0.95238 0.88624 0.79630 0.66667 0.39153 0.39153 0.28307 0.15873 459 378 84942.68317 74964.0 56811.62186 1155.20980 2488.0 95262.51431 85395.0 49292.67664 709.74925 1889.0 1506.06758 1361.0 731.89394 699.41354 1491.0 556.45986 532.0 184.42175 538.0 323.35354 0.00000 0.00000 0.09512 0.73484 1.00000 0.46332 0.28704 0.94288 0.94616 0.93952 39.38154 40.83333 22.89769 888.29730 3643.0 42.15810 42.83333 23.94119 707.01963 3238.0 0.85331 0.64745 0.02830 0.01607 0.10657 0.09512 less than 10% 50-100% 0.90488
279653 127 72 Puerto Rico PR San Juan Guaynabo Urban 927 787 18.396103 -66.104169 1105793.0 0 2700 1141 1559 803.42018 782.0 297.39258 259.30316 368.0 0.94693 0.87151 0.69832 0.61732 0.51397 0.46927 0.35754 0.32961 438 358 48733.67116 37845.0 45100.54010 928.32193 1267.0 56401.68133 44399.0 41082.90515 490.18479 729.0 1175.28642 1101.0 428.98751 261.28471 437.0 288.04047 247.0 185.55887 392.0 314.90566 0.01086 0.01086 0.01086 0.52714 0.53057 0.82530 0.73727 0.91500 0.90755 0.92043 48.64749 48.91667 23.05968 274.98956 1141.0 47.77526 50.58333 24.32015 362.20193 1559.0 0.65037 0.47257 0.02021 0.02021 0.10106 0.01086 less than 10% 50-100% 0.98914
247218 161 20 Kansas KS Manhattan Manhattan City City 66502 785 39.195573 -96.569366 2554403.0 0 5637 2586 3051 938.56493 881.0 392.44096 1005.42886 1704.0 0.99286 0.98247 0.91688 0.84740 0.78247 0.60974 0.55455 0.44416 1725 1540 31834.15466 22497.0 34046.50907 1548.67477 1983.0 54053.42396 50272.0 39609.12605 244.08903 395.0 1192.58759 1125.0 327.49674 76.61052 134.0 443.68855 444.0 76.12674 124.0 79.55556 0.05426 0.05426 0.05426 0.51938 0.18332 0.65545 0.74967 1.00000 1.00000 1.00000 26.07533 22.41667 11.84399 1296.89877 2586.0 24.17693 21.58333 11.10484 1854.48652 3051.0 0.13046 0.12356 0.00000 0.00000 0.03109 0.05426 less than 10% 50-100% 0.94574
In [38]:
train_subset = train[['city','second_mortgage','home_equity','good_debt','bad_debt']]
train_subset.head()
Out[38]:
city second_mortgage home_equity good_debt bad_debt
UID
267822 Hamilton 0.02077 0.08919 0.90592 0.09408
246444 South Bend 0.02222 0.04274 0.95726 0.04274
245683 Danville 0.00000 0.09512 0.90488 0.09512
279653 San Juan 0.01086 0.01086 0.98914 0.01086
247218 Manhattan 0.05426 0.05426 0.94574 0.05426
In [39]:
train_subset_city = train_subset[train_subset['city'].isin(city_list_10.index)]
train_subset_city['city'].value_counts()
Out[39]:
Chicago         286
Brooklyn        261
Los Angeles     219
Houston         213
Philadelphia    160
San Antonio     138
Baltimore       128
Las Vegas       123
Phoenix         114
Miami           105
Name: city, dtype: int64
In [40]:
train_subset_city.head()
Out[40]:
city second_mortgage home_equity good_debt bad_debt
UID
225738 Los Angeles 0.00000 0.00000 1.00000 0.00000
267431 Brooklyn 0.00000 0.06220 0.93780 0.06220
265422 Las Vegas 0.06256 0.19662 0.80338 0.19662
278415 Philadelphia 0.00000 0.03699 0.96301 0.03699
283137 San Antonio 0.00000 0.01838 0.98162 0.01838
In [41]:
plt.subplots(figsize=(12,8))
ax = sns.boxplot(x = 'city',y = 'second_mortgage',data = train_subset_city)
plt.title('city vs second mortgage')
plt.show()
In [42]:
plt.subplots(figsize=(12,8))
ax = sns.boxplot(x = 'city',y = 'home_equity',data = train_subset_city)
plt.title('city vs home equity')
plt.show()
In [43]:
plt.subplots(figsize=(12,8))
ax = sns.boxplot(x = 'city',y = 'bad_debt',data = train_subset_city)
plt.title('city vs bad debt')
plt.show()
In [44]:
plt.subplots(figsize=(12,8))
ax = sns.boxplot(x = 'city',y = 'good_debt',data = train_subset_city)
plt.title('city vs good debt')
plt.show()
In [45]:
train['remaining_income'] = train['family_median'] - train['hi_median']
In [46]:
plt.subplots(figsize=(10,12))
ax = sns.boxplot(data = train[['family_mean', 'hi_mean', 'remaining_income']])
plt.title('Collated Mean Income distribution ')
plt.show()
In [47]:
plt.subplots(figsize=(10,12))
ax = sns.boxplot(data = train[['family_median', 'hi_median', 'remaining_income']])
plt.title('Collated Median Income distribution ')
plt.show()
In [48]:
train_density_eda = train[['pop','ALand','male_age_median','female_age_median','male_pop','female_pop']]
train_density_eda.head()
Out[48]:
pop ALand male_age_median female_age_median male_pop female_pop
UID
267822 5230 202183361.0 44.00000 45.33333 2612 2618
246444 2633 1560828.0 32.00000 37.58333 1349 1284
245683 6881 69561595.0 40.83333 42.83333 3643 3238
279653 2700 1105793.0 48.91667 50.58333 1141 1559
247218 5637 2554403.0 22.41667 21.58333 2586 3051
In [49]:
train_density_eda['pop_density'] = train_density_eda['pop'] / train_density_eda['ALand']
train_density_eda.head()
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[49]:
pop ALand male_age_median female_age_median male_pop female_pop pop_density
UID
267822 5230 202183361.0 44.00000 45.33333 2612 2618 0.000026
246444 2633 1560828.0 32.00000 37.58333 1349 1284 0.001687
245683 6881 69561595.0 40.83333 42.83333 3643 3238 0.000099
279653 2700 1105793.0 48.91667 50.58333 1141 1559 0.002442
247218 5637 2554403.0 22.41667 21.58333 2586 3051 0.002207
In [50]:
train_density_eda['median_age'] = (train_density_eda['male_age_median'] *  train_density_eda['male_pop'] + train_density_eda['female_age_median'] *  train_density_eda['female_pop'])  / train_density_eda['pop']
train_density_eda.head()
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[50]:
pop ALand male_age_median female_age_median male_pop female_pop pop_density median_age
UID
267822 5230 202183361.0 44.00000 45.33333 2612 2618 0.000026 44.667430
246444 2633 1560828.0 32.00000 37.58333 1349 1284 0.001687 34.722748
245683 6881 69561595.0 40.83333 42.83333 3643 3238 0.000099 41.774472
279653 2700 1105793.0 48.91667 50.58333 1141 1559 0.002442 49.879012
247218 5637 2554403.0 22.41667 21.58333 2586 3051 0.002207 21.965629
In [51]:
sns.distplot(train_density_eda['pop_density'])
plt.show()
In [52]:
sns.distplot(train_density_eda['median_age'])
plt.show()
In [53]:
age_df = train[['state','male_pop','female_pop','male_age_median','female_age_median','married','separated','divorced']]
age_df.head()
Out[53]:
state male_pop female_pop male_age_median female_age_median married separated divorced
UID
267822 New York 2612 2618 44.00000 45.33333 0.57851 0.01240 0.08770
246444 Indiana 1349 1284 32.00000 37.58333 0.34886 0.01426 0.09030
245683 Indiana 3643 3238 40.83333 42.83333 0.64745 0.01607 0.10657
279653 Puerto Rico 1141 1559 48.91667 50.58333 0.47257 0.02021 0.10106
247218 Kansas 2586 3051 22.41667 21.58333 0.12356 0.00000 0.03109
In [54]:
bins = [0,18,30,45,60,100]
labels = ['teenager','youth','middle_aged','little_old','old']
In [55]:
age_df['male_pop_bins'] = pd.cut(age_df['male_age_median'],bins=bins,labels=labels)
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [56]:
age_df.head()
Out[56]:
state male_pop female_pop male_age_median female_age_median married separated divorced male_pop_bins
UID
267822 New York 2612 2618 44.00000 45.33333 0.57851 0.01240 0.08770 middle_aged
246444 Indiana 1349 1284 32.00000 37.58333 0.34886 0.01426 0.09030 middle_aged
245683 Indiana 3643 3238 40.83333 42.83333 0.64745 0.01607 0.10657 middle_aged
279653 Puerto Rico 1141 1559 48.91667 50.58333 0.47257 0.02021 0.10106 little_old
247218 Kansas 2586 3051 22.41667 21.58333 0.12356 0.00000 0.03109 youth
In [57]:
age_df['female_pop_bins'] = pd.cut(age_df['female_age_median'],bins=bins,labels=labels)
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [58]:
age_df.head()
Out[58]:
state male_pop female_pop male_age_median female_age_median married separated divorced male_pop_bins female_pop_bins
UID
267822 New York 2612 2618 44.00000 45.33333 0.57851 0.01240 0.08770 middle_aged little_old
246444 Indiana 1349 1284 32.00000 37.58333 0.34886 0.01426 0.09030 middle_aged middle_aged
245683 Indiana 3643 3238 40.83333 42.83333 0.64745 0.01607 0.10657 middle_aged middle_aged
279653 Puerto Rico 1141 1559 48.91667 50.58333 0.47257 0.02021 0.10106 little_old little_old
247218 Kansas 2586 3051 22.41667 21.58333 0.12356 0.00000 0.03109 youth youth
In [59]:
plt.figure(figsize = (50, 10))
ax = sns.barplot(x = 'state', y = 'married', hue = 'male_pop_bins', data = age_df)
plt.xticks(rotation = 90)
ax.set_title('Married Male population by state', fontsize = 20)
plt.show()
In [60]:
plt.figure(figsize = (50, 10))
ax = sns.barplot(x = 'state', y = 'married', hue = 'female_pop_bins', data = age_df)
plt.xticks(rotation = 90)
ax.set_title('Married Female population by state', fontsize = 20)
plt.show()
In [61]:
plt.figure(figsize = (50, 10))
ax = sns.barplot(x = 'state', y = 'separated', hue = 'male_pop_bins', data = age_df)
plt.xticks(rotation = 90)
ax.set_title('Separated Male population by state', fontsize = 20)
plt.show()
In [62]:
plt.figure(figsize = (50, 10))
ax = sns.barplot(x = 'state', y = 'separated', hue = 'female_pop_bins', data = age_df)
plt.xticks(rotation = 90)
ax.set_title('Separated Female population by state', fontsize = 20)
plt.show()
In [63]:
plt.figure(figsize = (50, 10))
ax = sns.barplot(x = 'state', y = 'divorced', hue = 'male_pop_bins', data = age_df)
plt.xticks(rotation = 90)
ax.set_title('Divorced Male population by state', fontsize = 20)
plt.show()
In [64]:
plt.figure(figsize = (50, 10))
ax = sns.barplot(x = 'state', y = 'divorced', hue = 'female_pop_bins', data = age_df)
plt.xticks(rotation = 90)
ax.set_title('Divorced Female population by state', fontsize = 20)
plt.show()
In [65]:
train.head()
Out[65]:
COUNTYID STATEID state state_ab city place type zip_code area_code lat lng ALand AWater pop male_pop female_pop rent_mean rent_median rent_stdev rent_sample_weight rent_samples rent_gt_10 rent_gt_15 rent_gt_20 rent_gt_25 rent_gt_30 rent_gt_35 rent_gt_40 rent_gt_50 universe_samples used_samples hi_mean hi_median hi_stdev hi_sample_weight hi_samples family_mean family_median family_stdev family_sample_weight family_samples hc_mortgage_mean hc_mortgage_median hc_mortgage_stdev hc_mortgage_sample_weight hc_mortgage_samples hc_mean hc_median hc_stdev hc_samples hc_sample_weight home_equity_second_mortgage second_mortgage home_equity debt second_mortgage_cdf home_equity_cdf debt_cdf hs_degree hs_degree_male hs_degree_female male_age_mean male_age_median male_age_stdev male_age_sample_weight male_age_samples female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced bad_debt bins_bad_debt bins_debt good_debt remaining_income
UID
267822 53 36 New York NY Hamilton Hamilton City 13346 315 42.840812 -75.501524 202183361.0 1699120 5230 2612 2618 769.38638 784.0 232.63967 272.34441 362.0 0.86761 0.79155 0.59155 0.45634 0.42817 0.18592 0.15493 0.12958 387 355 63125.28406 48120.0 49042.01206 1290.96240 2024.0 67994.14790 53245.0 47667.30119 884.33516 1491.0 1414.80295 1223.0 641.22898 377.83135 867.0 570.01530 558.0 270.11299 770.0 499.29293 0.01588 0.02077 0.08919 0.52963 0.43658 0.49087 0.73341 0.89288 0.85880 0.92434 42.48574 44.00000 22.97306 696.42136 2612.0 44.48629 45.33333 22.51276 685.33845 2618.0 0.79046 0.57851 0.01882 0.01240 0.08770 0.09408 less than 10% 50-100% 0.90592 5125.0
246444 141 18 Indiana IN South Bend Roseland City 46616 574 41.701441 -86.266614 1560828.0 100363 2633 1349 1284 804.87924 848.0 253.46747 312.58622 513.0 0.97410 0.93227 0.69920 0.69920 0.55179 0.41235 0.39044 0.27888 542 502 41931.92593 35186.0 31639.50203 838.74664 1127.0 50670.10337 43023.0 34715.57548 375.28798 554.0 864.41390 784.0 482.27020 316.88320 356.0 351.98293 336.0 125.40457 229.0 189.60606 0.02222 0.02222 0.04274 0.60855 0.42174 0.70823 0.58120 0.90487 0.86947 0.94187 34.84728 32.00000 20.37452 323.90204 1349.0 36.48391 37.58333 23.43353 267.23367 1284.0 0.52483 0.34886 0.01426 0.01426 0.09030 0.04274 less than 10% 50-100% 0.95726 7837.0
245683 63 18 Indiana IN Danville Danville City 46122 317 39.792202 -86.515246 69561595.0 284193 6881 3643 3238 742.77365 703.0 323.39011 291.85520 378.0 0.95238 0.88624 0.79630 0.66667 0.39153 0.39153 0.28307 0.15873 459 378 84942.68317 74964.0 56811.62186 1155.20980 2488.0 95262.51431 85395.0 49292.67664 709.74925 1889.0 1506.06758 1361.0 731.89394 699.41354 1491.0 556.45986 532.0 184.42175 538.0 323.35354 0.00000 0.00000 0.09512 0.73484 1.00000 0.46332 0.28704 0.94288 0.94616 0.93952 39.38154 40.83333 22.89769 888.29730 3643.0 42.15810 42.83333 23.94119 707.01963 3238.0 0.85331 0.64745 0.02830 0.01607 0.10657 0.09512 less than 10% 50-100% 0.90488 10431.0
279653 127 72 Puerto Rico PR San Juan Guaynabo Urban 927 787 18.396103 -66.104169 1105793.0 0 2700 1141 1559 803.42018 782.0 297.39258 259.30316 368.0 0.94693 0.87151 0.69832 0.61732 0.51397 0.46927 0.35754 0.32961 438 358 48733.67116 37845.0 45100.54010 928.32193 1267.0 56401.68133 44399.0 41082.90515 490.18479 729.0 1175.28642 1101.0 428.98751 261.28471 437.0 288.04047 247.0 185.55887 392.0 314.90566 0.01086 0.01086 0.01086 0.52714 0.53057 0.82530 0.73727 0.91500 0.90755 0.92043 48.64749 48.91667 23.05968 274.98956 1141.0 47.77526 50.58333 24.32015 362.20193 1559.0 0.65037 0.47257 0.02021 0.02021 0.10106 0.01086 less than 10% 50-100% 0.98914 6554.0
247218 161 20 Kansas KS Manhattan Manhattan City City 66502 785 39.195573 -96.569366 2554403.0 0 5637 2586 3051 938.56493 881.0 392.44096 1005.42886 1704.0 0.99286 0.98247 0.91688 0.84740 0.78247 0.60974 0.55455 0.44416 1725 1540 31834.15466 22497.0 34046.50907 1548.67477 1983.0 54053.42396 50272.0 39609.12605 244.08903 395.0 1192.58759 1125.0 327.49674 76.61052 134.0 443.68855 444.0 76.12674 124.0 79.55556 0.05426 0.05426 0.05426 0.51938 0.18332 0.65545 0.74967 1.00000 1.00000 1.00000 26.07533 22.41667 11.84399 1296.89877 2586.0 24.17693 21.58333 11.10484 1854.48652 3051.0 0.13046 0.12356 0.00000 0.00000 0.03109 0.05426 less than 10% 50-100% 0.94574 27775.0
In [66]:
state_rent = train.groupby('state')[['rent_median','hi_median']].median()
state_rent
Out[66]:
rent_median hi_median
state
Alabama 681.0 40203.0
Alaska 1071.5 69880.5
Arizona 950.0 48956.0
Arkansas 660.0 38973.5
California 1333.0 62252.5
Colorado 1090.0 61621.0
Connecticut 1169.0 76957.0
Delaware 1022.0 60642.0
District of Columbia 1123.0 68846.0
Florida 1012.0 45475.0
Georgia 854.0 45917.0
Hawaii 1416.0 72312.0
Idaho 731.0 45678.0
Illinois 912.0 54703.0
Indiana 736.0 46429.0
Iowa 652.0 51345.0
Kansas 709.0 47697.0
Kentucky 659.0 39967.0
Louisiana 772.0 42520.0
Maine 774.5 44858.0
Maryland 1285.0 76051.0
Massachusetts 1123.0 69613.0
Michigan 784.0 47870.5
Minnesota 852.5 58905.0
Mississippi 689.0 35492.0
Missouri 750.0 45179.5
Montana 679.0 45510.0
Nebraska 712.0 51472.0
Nevada 1046.0 52784.0
New Hampshire 991.0 66038.0
New Jersey 1253.5 73875.5
New Mexico 729.0 42156.0
New York 1197.0 59967.0
North Carolina 761.0 45158.0
North Dakota 655.0 54700.0
Ohio 727.0 47689.0
Oklahoma 686.0 43487.0
Oregon 906.0 50419.0
Pennsylvania 829.0 52820.0
Puerto Rico 457.5 17977.0
Rhode Island 947.0 56493.0
South Carolina 766.0 42153.0
South Dakota 616.0 48439.0
Tennessee 731.0 41577.0
Texas 845.5 48960.0
Utah 965.0 60968.5
Vermont 868.0 52658.0
Virginia 1112.0 62910.0
Washington 1002.0 58428.5
West Virginia 628.5 40308.0
Wisconsin 766.0 52108.0
Wyoming 802.5 58582.0
In [67]:
rent_as_pct_income_overall = (state_rent['rent_median'].sum() / state_rent['hi_median'].sum())*100
rent_as_pct_income_overall
Out[67]:
1.6687203615872597
In [68]:
state_rent['rent_as_pct_income'] = (state_rent['rent_median'] / state_rent['hi_median'])*100
state_rent
Out[68]:
rent_median hi_median rent_as_pct_income
state
Alabama 681.0 40203.0 1.693903
Alaska 1071.5 69880.5 1.533332
Arizona 950.0 48956.0 1.940518
Arkansas 660.0 38973.5 1.693458
California 1333.0 62252.5 2.141279
Colorado 1090.0 61621.0 1.768877
Connecticut 1169.0 76957.0 1.519030
Delaware 1022.0 60642.0 1.685301
District of Columbia 1123.0 68846.0 1.631177
Florida 1012.0 45475.0 2.225399
Georgia 854.0 45917.0 1.859878
Hawaii 1416.0 72312.0 1.958181
Idaho 731.0 45678.0 1.600333
Illinois 912.0 54703.0 1.667185
Indiana 736.0 46429.0 1.585216
Iowa 652.0 51345.0 1.269841
Kansas 709.0 47697.0 1.486467
Kentucky 659.0 39967.0 1.648860
Louisiana 772.0 42520.0 1.815616
Maine 774.5 44858.0 1.726559
Maryland 1285.0 76051.0 1.689656
Massachusetts 1123.0 69613.0 1.613204
Michigan 784.0 47870.5 1.637752
Minnesota 852.5 58905.0 1.447246
Mississippi 689.0 35492.0 1.941283
Missouri 750.0 45179.5 1.660045
Montana 679.0 45510.0 1.491980
Nebraska 712.0 51472.0 1.383276
Nevada 1046.0 52784.0 1.981661
New Hampshire 991.0 66038.0 1.500651
New Jersey 1253.5 73875.5 1.696774
New Mexico 729.0 42156.0 1.729291
New York 1197.0 59967.0 1.996098
North Carolina 761.0 45158.0 1.685194
North Dakota 655.0 54700.0 1.197441
Ohio 727.0 47689.0 1.524461
Oklahoma 686.0 43487.0 1.577483
Oregon 906.0 50419.0 1.796942
Pennsylvania 829.0 52820.0 1.569481
Puerto Rico 457.5 17977.0 2.544919
Rhode Island 947.0 56493.0 1.676314
South Carolina 766.0 42153.0 1.817190
South Dakota 616.0 48439.0 1.271703
Tennessee 731.0 41577.0 1.758184
Texas 845.5 48960.0 1.726920
Utah 965.0 60968.5 1.582785
Vermont 868.0 52658.0 1.648373
Virginia 1112.0 62910.0 1.767605
Washington 1002.0 58428.5 1.714917
West Virginia 628.5 40308.0 1.559244
Wisconsin 766.0 52108.0 1.470024
Wyoming 802.5 58582.0 1.369875
In [69]:
train['meadian_age'] = train_density_eda['median_age']
In [70]:
train.columns
Out[70]:
Index(['COUNTYID', 'STATEID', 'state', 'state_ab', 'city', 'place', 'type',
       'zip_code', 'area_code', 'lat', 'lng', 'ALand', 'AWater', 'pop',
       'male_pop', 'female_pop', 'rent_mean', 'rent_median', 'rent_stdev',
       'rent_sample_weight', 'rent_samples', 'rent_gt_10', 'rent_gt_15',
       'rent_gt_20', 'rent_gt_25', 'rent_gt_30', 'rent_gt_35', 'rent_gt_40',
       'rent_gt_50', 'universe_samples', 'used_samples', 'hi_mean',
       'hi_median', 'hi_stdev', 'hi_sample_weight', 'hi_samples',
       'family_mean', 'family_median', 'family_stdev', 'family_sample_weight',
       'family_samples', 'hc_mortgage_mean', 'hc_mortgage_median',
       'hc_mortgage_stdev', 'hc_mortgage_sample_weight', 'hc_mortgage_samples',
       'hc_mean', 'hc_median', 'hc_stdev', 'hc_samples', 'hc_sample_weight',
       'home_equity_second_mortgage', 'second_mortgage', 'home_equity', 'debt',
       'second_mortgage_cdf', 'home_equity_cdf', 'debt_cdf', 'hs_degree',
       'hs_degree_male', 'hs_degree_female', 'male_age_mean',
       'male_age_median', 'male_age_stdev', 'male_age_sample_weight',
       'male_age_samples', 'female_age_mean', 'female_age_median',
       'female_age_stdev', 'female_age_sample_weight', 'female_age_samples',
       'pct_own', 'married', 'married_snp', 'separated', 'divorced',
       'bad_debt', 'bins_bad_debt', 'bins_debt', 'good_debt',
       'remaining_income', 'meadian_age'],
      dtype='object')
In [71]:
cor = train[['COUNTYID','STATEID','zip_code','type','pop','male_pop','female_pop', 'family_mean',
         'second_mortgage', 'home_equity', 'debt','hs_degree','hi_mean','hc_mean',
           'meadian_age','pct_own', 'married','separated', 'divorced']].corr()
In [72]:
plt.figure(figsize=(12,12))
sns.heatmap(cor,annot=True)
plt.show()
In [ ]:
 
In [73]:
from sklearn.decomposition import FactorAnalysis
In [74]:
fa = FactorAnalysis(n_components=5,random_state=11)
In [75]:
train_transformed = fa.fit_transform(train.select_dtypes(exclude=('object','category')))
In [76]:
train_transformed.shape
Out[76]:
(26585, 5)
In [77]:
train_transformed
Out[77]:
array([[-0.02174625,  0.06978994,  1.25438432, -0.3379435 , -0.3368652 ],
       [-0.02897086, -0.09746471,  0.1120596 , -0.88441509, -0.28839431],
       [-0.02814015, -0.04003848,  0.13308064,  0.53261952, -0.4300001 ],
       ...,
       [ 0.05000689,  0.99014488, -0.97034794,  0.46360024, -0.3329646 ],
       [-0.02870642, -0.08305636, -0.88574606,  3.11451733, -0.50294382],
       [-0.02942439, -0.09195811, -1.33280849, -0.70352571, -0.09956675]])
In [78]:
x_train = pd.read_csv('train.csv')
x_test = pd.read_csv('test.csv')
In [79]:
x_train.drop(['BLOCKID','SUMLEVEL'],axis=1,inplace=True)
In [80]:
x_train.dropna(axis=0,inplace=True)
x_train.head()
Out[80]:
UID COUNTYID STATEID state state_ab city place type primary zip_code area_code lat lng ALand AWater pop male_pop female_pop rent_mean rent_median rent_stdev rent_sample_weight rent_samples rent_gt_10 rent_gt_15 rent_gt_20 rent_gt_25 rent_gt_30 rent_gt_35 rent_gt_40 rent_gt_50 universe_samples used_samples hi_mean hi_median hi_stdev hi_sample_weight hi_samples family_mean family_median family_stdev family_sample_weight family_samples hc_mortgage_mean hc_mortgage_median hc_mortgage_stdev hc_mortgage_sample_weight hc_mortgage_samples hc_mean hc_median hc_stdev hc_samples hc_sample_weight home_equity_second_mortgage second_mortgage home_equity debt second_mortgage_cdf home_equity_cdf debt_cdf hs_degree hs_degree_male hs_degree_female male_age_mean male_age_median male_age_stdev male_age_sample_weight male_age_samples female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced
0 267822 53 36 New York NY Hamilton Hamilton City tract 13346 315 42.840812 -75.501524 202183361.0 1699120 5230 2612 2618 769.38638 784.0 232.63967 272.34441 362.0 0.86761 0.79155 0.59155 0.45634 0.42817 0.18592 0.15493 0.12958 387 355 63125.28406 48120.0 49042.01206 1290.96240 2024.0 67994.14790 53245.0 47667.30119 884.33516 1491.0 1414.80295 1223.0 641.22898 377.83135 867.0 570.01530 558.0 270.11299 770.0 499.29293 0.01588 0.02077 0.08919 0.52963 0.43658 0.49087 0.73341 0.89288 0.85880 0.92434 42.48574 44.00000 22.97306 696.42136 2612.0 44.48629 45.33333 22.51276 685.33845 2618.0 0.79046 0.57851 0.01882 0.01240 0.08770
1 246444 141 18 Indiana IN South Bend Roseland City tract 46616 574 41.701441 -86.266614 1560828.0 100363 2633 1349 1284 804.87924 848.0 253.46747 312.58622 513.0 0.97410 0.93227 0.69920 0.69920 0.55179 0.41235 0.39044 0.27888 542 502 41931.92593 35186.0 31639.50203 838.74664 1127.0 50670.10337 43023.0 34715.57548 375.28798 554.0 864.41390 784.0 482.27020 316.88320 356.0 351.98293 336.0 125.40457 229.0 189.60606 0.02222 0.02222 0.04274 0.60855 0.42174 0.70823 0.58120 0.90487 0.86947 0.94187 34.84728 32.00000 20.37452 323.90204 1349.0 36.48391 37.58333 23.43353 267.23367 1284.0 0.52483 0.34886 0.01426 0.01426 0.09030
2 245683 63 18 Indiana IN Danville Danville City tract 46122 317 39.792202 -86.515246 69561595.0 284193 6881 3643 3238 742.77365 703.0 323.39011 291.85520 378.0 0.95238 0.88624 0.79630 0.66667 0.39153 0.39153 0.28307 0.15873 459 378 84942.68317 74964.0 56811.62186 1155.20980 2488.0 95262.51431 85395.0 49292.67664 709.74925 1889.0 1506.06758 1361.0 731.89394 699.41354 1491.0 556.45986 532.0 184.42175 538.0 323.35354 0.00000 0.00000 0.09512 0.73484 1.00000 0.46332 0.28704 0.94288 0.94616 0.93952 39.38154 40.83333 22.89769 888.29730 3643.0 42.15810 42.83333 23.94119 707.01963 3238.0 0.85331 0.64745 0.02830 0.01607 0.10657
3 279653 127 72 Puerto Rico PR San Juan Guaynabo Urban tract 927 787 18.396103 -66.104169 1105793.0 0 2700 1141 1559 803.42018 782.0 297.39258 259.30316 368.0 0.94693 0.87151 0.69832 0.61732 0.51397 0.46927 0.35754 0.32961 438 358 48733.67116 37845.0 45100.54010 928.32193 1267.0 56401.68133 44399.0 41082.90515 490.18479 729.0 1175.28642 1101.0 428.98751 261.28471 437.0 288.04047 247.0 185.55887 392.0 314.90566 0.01086 0.01086 0.01086 0.52714 0.53057 0.82530 0.73727 0.91500 0.90755 0.92043 48.64749 48.91667 23.05968 274.98956 1141.0 47.77526 50.58333 24.32015 362.20193 1559.0 0.65037 0.47257 0.02021 0.02021 0.10106
4 247218 161 20 Kansas KS Manhattan Manhattan City City tract 66502 785 39.195573 -96.569366 2554403.0 0 5637 2586 3051 938.56493 881.0 392.44096 1005.42886 1704.0 0.99286 0.98247 0.91688 0.84740 0.78247 0.60974 0.55455 0.44416 1725 1540 31834.15466 22497.0 34046.50907 1548.67477 1983.0 54053.42396 50272.0 39609.12605 244.08903 395.0 1192.58759 1125.0 327.49674 76.61052 134.0 443.68855 444.0 76.12674 124.0 79.55556 0.05426 0.05426 0.05426 0.51938 0.18332 0.65545 0.74967 1.00000 1.00000 1.00000 26.07533 22.41667 11.84399 1296.89877 2586.0 24.17693 21.58333 11.10484 1854.48652 3051.0 0.13046 0.12356 0.00000 0.00000 0.03109
In [81]:
x_train.drop_duplicates(inplace=True)
In [82]:
x_train.shape
Out[82]:
(26585, 78)
In [83]:
x_test.head()
Out[83]:
UID BLOCKID SUMLEVEL COUNTYID STATEID state state_ab city place type primary zip_code area_code lat lng ALand AWater pop male_pop female_pop rent_mean rent_median rent_stdev rent_sample_weight rent_samples rent_gt_10 rent_gt_15 rent_gt_20 rent_gt_25 rent_gt_30 rent_gt_35 rent_gt_40 rent_gt_50 universe_samples used_samples hi_mean hi_median hi_stdev hi_sample_weight hi_samples family_mean family_median family_stdev family_sample_weight family_samples hc_mortgage_mean hc_mortgage_median hc_mortgage_stdev hc_mortgage_sample_weight hc_mortgage_samples hc_mean hc_median hc_stdev hc_samples hc_sample_weight home_equity_second_mortgage second_mortgage home_equity debt second_mortgage_cdf home_equity_cdf debt_cdf hs_degree hs_degree_male hs_degree_female male_age_mean male_age_median male_age_stdev male_age_sample_weight male_age_samples female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced
0 255504 NaN 140 163 26 Michigan MI Detroit Dearborn Heights City CDP tract 48239 313 42.346422 -83.252823 2711280 39555 3417 1479 1938 858.57169 859.0 232.39082 276.07497 424.0 1.00000 0.95696 0.85316 0.85316 0.85316 0.85316 0.76962 0.63544 435 395 48899.52121 38746.0 44392.20902 798.02401 1180.0 53802.87122 45167.0 43756.56479 464.30972 769.0 1139.24548 1109.0 336.47710 262.67011 474.0 488.51323 436.0 192.75147 271.0 189.18182 0.06443 0.06443 0.07651 0.63624 0.14111 0.55087 0.51965 0.91047 0.92010 0.90391 33.37131 27.83333 22.36768 334.30978 1479.0 34.78682 33.75000 21.58531 416.48097 1938.0 0.70252 0.28217 0.05910 0.03813 0.14299
1 252676 NaN 140 1 23 Maine ME Auburn Auburn City City tract 4210 207 44.100724 -70.257832 14778785 2705204 3796 1846 1950 832.68625 750.0 267.22342 183.32299 245.0 1.00000 1.00000 0.86611 0.67364 0.30962 0.30962 0.30962 0.27197 275 239 72335.33234 61008.0 51895.81159 922.82969 1722.0 85642.22095 74759.0 49156.72870 482.99945 1147.0 1533.25988 1438.0 536.61118 373.96188 937.0 661.31296 668.0 201.31365 510.0 279.69697 0.01175 0.01175 0.14375 0.64755 0.52310 0.26442 0.49359 0.94290 0.92832 0.95736 43.88680 46.08333 22.90302 427.10824 1846.0 44.23451 46.66667 22.37036 532.03505 1950.0 0.85128 0.64221 0.02338 0.00000 0.13377
2 276314 NaN 140 15 42 Pennsylvania PA Pine City Millerton Borough tract 14871 607 41.948556 -76.783808 258903666 863840 3944 2065 1879 816.00639 755.0 416.25699 141.39063 217.0 0.97573 0.93204 0.78641 0.71845 0.63592 0.47573 0.43689 0.32524 245 206 58501.15901 51648.0 45245.27248 893.07759 1461.0 65694.06582 57186.0 44239.31893 619.73962 1084.0 1254.54462 1089.0 596.85204 340.45884 552.0 397.44466 356.0 189.40372 664.0 534.16737 0.01069 0.01316 0.06497 0.45395 0.51066 0.60484 0.83848 0.89238 0.86003 0.92463 39.81661 41.91667 24.29111 499.10080 2065.0 41.62426 44.50000 22.86213 453.11959 1879.0 0.81897 0.59961 0.01746 0.01358 0.10026
3 248614 NaN 140 231 21 Kentucky KY Monticello Monticello City City tract 42633 606 36.746009 -84.766870 501694825 2623067 2508 1427 1081 418.68937 385.0 156.92024 88.95960 93.0 1.00000 0.93548 0.93548 0.64516 0.55914 0.46237 0.46237 0.36559 153 93 38237.55059 31612.0 34527.61607 775.17947 957.0 44156.38709 34687.0 34899.74300 535.21987 689.0 862.65763 749.0 624.42157 299.56752 337.0 200.88113 180.0 91.56490 467.0 454.85404 0.00995 0.00995 0.01741 0.41915 0.53770 0.80931 0.87403 0.60908 0.56584 0.65947 41.81638 43.00000 24.65325 333.57733 1427.0 44.81200 48.00000 21.03155 263.94320 1081.0 0.84609 0.56953 0.05492 0.04694 0.12489
4 286865 NaN 140 355 48 Texas TX Corpus Christi Edroy Town tract 78410 361 27.882461 -97.678586 13796057 497689 6230 3274 2956 1031.63763 997.0 326.76727 277.39844 624.0 0.72276 0.66506 0.53526 0.38301 0.18910 0.16667 0.14263 0.11058 660 624 114456.07790 94211.0 81950.95692 836.30759 2404.0 123527.02420 103898.0 72173.55823 507.42257 1738.0 1996.41425 1907.0 740.21168 319.97570 1102.0 867.57713 804.0 376.20236 642.0 333.91919 0.00000 0.00000 0.03440 0.63188 1.00000 0.74519 0.52943 0.86297 0.87969 0.84466 42.13301 43.75000 22.69502 833.57435 3274.0 40.66618 42.66667 21.30900 709.90829 2956.0 0.79077 0.57620 0.01726 0.00588 0.16379
In [84]:
x_test.shape
Out[84]:
(11709, 80)
In [85]:
x_test.drop(['BLOCKID','SUMLEVEL'],axis=1,inplace=True)
In [86]:
x_test.isna().sum()
Out[86]:
UID              0
COUNTYID         0
STATEID          0
state            0
state_ab         0
              ... 
pct_own        122
married         84
married_snp     84
separated       84
divorced        84
Length: 78, dtype: int64
In [87]:
x_test.dropna(axis=0,inplace=True)
In [88]:
x_test.drop_duplicates(inplace=True)
In [89]:
x_test.shape
Out[89]:
(11355, 78)
In [90]:
imp_feature = x_train.select_dtypes(exclude=('object','category'))
In [91]:
imp_feature.head()
Out[91]:
UID COUNTYID STATEID zip_code area_code lat lng ALand AWater pop male_pop female_pop rent_mean rent_median rent_stdev rent_sample_weight rent_samples rent_gt_10 rent_gt_15 rent_gt_20 rent_gt_25 rent_gt_30 rent_gt_35 rent_gt_40 rent_gt_50 universe_samples used_samples hi_mean hi_median hi_stdev hi_sample_weight hi_samples family_mean family_median family_stdev family_sample_weight family_samples hc_mortgage_mean hc_mortgage_median hc_mortgage_stdev hc_mortgage_sample_weight hc_mortgage_samples hc_mean hc_median hc_stdev hc_samples hc_sample_weight home_equity_second_mortgage second_mortgage home_equity debt second_mortgage_cdf home_equity_cdf debt_cdf hs_degree hs_degree_male hs_degree_female male_age_mean male_age_median male_age_stdev male_age_sample_weight male_age_samples female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced
0 267822 53 36 13346 315 42.840812 -75.501524 202183361.0 1699120 5230 2612 2618 769.38638 784.0 232.63967 272.34441 362.0 0.86761 0.79155 0.59155 0.45634 0.42817 0.18592 0.15493 0.12958 387 355 63125.28406 48120.0 49042.01206 1290.96240 2024.0 67994.14790 53245.0 47667.30119 884.33516 1491.0 1414.80295 1223.0 641.22898 377.83135 867.0 570.01530 558.0 270.11299 770.0 499.29293 0.01588 0.02077 0.08919 0.52963 0.43658 0.49087 0.73341 0.89288 0.85880 0.92434 42.48574 44.00000 22.97306 696.42136 2612.0 44.48629 45.33333 22.51276 685.33845 2618.0 0.79046 0.57851 0.01882 0.01240 0.08770
1 246444 141 18 46616 574 41.701441 -86.266614 1560828.0 100363 2633 1349 1284 804.87924 848.0 253.46747 312.58622 513.0 0.97410 0.93227 0.69920 0.69920 0.55179 0.41235 0.39044 0.27888 542 502 41931.92593 35186.0 31639.50203 838.74664 1127.0 50670.10337 43023.0 34715.57548 375.28798 554.0 864.41390 784.0 482.27020 316.88320 356.0 351.98293 336.0 125.40457 229.0 189.60606 0.02222 0.02222 0.04274 0.60855 0.42174 0.70823 0.58120 0.90487 0.86947 0.94187 34.84728 32.00000 20.37452 323.90204 1349.0 36.48391 37.58333 23.43353 267.23367 1284.0 0.52483 0.34886 0.01426 0.01426 0.09030
2 245683 63 18 46122 317 39.792202 -86.515246 69561595.0 284193 6881 3643 3238 742.77365 703.0 323.39011 291.85520 378.0 0.95238 0.88624 0.79630 0.66667 0.39153 0.39153 0.28307 0.15873 459 378 84942.68317 74964.0 56811.62186 1155.20980 2488.0 95262.51431 85395.0 49292.67664 709.74925 1889.0 1506.06758 1361.0 731.89394 699.41354 1491.0 556.45986 532.0 184.42175 538.0 323.35354 0.00000 0.00000 0.09512 0.73484 1.00000 0.46332 0.28704 0.94288 0.94616 0.93952 39.38154 40.83333 22.89769 888.29730 3643.0 42.15810 42.83333 23.94119 707.01963 3238.0 0.85331 0.64745 0.02830 0.01607 0.10657
3 279653 127 72 927 787 18.396103 -66.104169 1105793.0 0 2700 1141 1559 803.42018 782.0 297.39258 259.30316 368.0 0.94693 0.87151 0.69832 0.61732 0.51397 0.46927 0.35754 0.32961 438 358 48733.67116 37845.0 45100.54010 928.32193 1267.0 56401.68133 44399.0 41082.90515 490.18479 729.0 1175.28642 1101.0 428.98751 261.28471 437.0 288.04047 247.0 185.55887 392.0 314.90566 0.01086 0.01086 0.01086 0.52714 0.53057 0.82530 0.73727 0.91500 0.90755 0.92043 48.64749 48.91667 23.05968 274.98956 1141.0 47.77526 50.58333 24.32015 362.20193 1559.0 0.65037 0.47257 0.02021 0.02021 0.10106
4 247218 161 20 66502 785 39.195573 -96.569366 2554403.0 0 5637 2586 3051 938.56493 881.0 392.44096 1005.42886 1704.0 0.99286 0.98247 0.91688 0.84740 0.78247 0.60974 0.55455 0.44416 1725 1540 31834.15466 22497.0 34046.50907 1548.67477 1983.0 54053.42396 50272.0 39609.12605 244.08903 395.0 1192.58759 1125.0 327.49674 76.61052 134.0 443.68855 444.0 76.12674 124.0 79.55556 0.05426 0.05426 0.05426 0.51938 0.18332 0.65545 0.74967 1.00000 1.00000 1.00000 26.07533 22.41667 11.84399 1296.89877 2586.0 24.17693 21.58333 11.10484 1854.48652 3051.0 0.13046 0.12356 0.00000 0.00000 0.03109
In [92]:
imp_feature.shape
Out[92]:
(26585, 72)
In [93]:
to_drop = ['UID','COUNTYID', 'STATEID', 'zip_code', 'area_code', 'lat', 'lng']
In [94]:
for col in imp_feature.columns:
    if col in to_drop:
        imp_feature.drop(col,axis=1,inplace=True)
/usr/local/lib/python3.7/site-packages/pandas/core/frame.py:3997: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [95]:
imp_feature.head()
Out[95]:
ALand AWater pop male_pop female_pop rent_mean rent_median rent_stdev rent_sample_weight rent_samples rent_gt_10 rent_gt_15 rent_gt_20 rent_gt_25 rent_gt_30 rent_gt_35 rent_gt_40 rent_gt_50 universe_samples used_samples hi_mean hi_median hi_stdev hi_sample_weight hi_samples family_mean family_median family_stdev family_sample_weight family_samples hc_mortgage_mean hc_mortgage_median hc_mortgage_stdev hc_mortgage_sample_weight hc_mortgage_samples hc_mean hc_median hc_stdev hc_samples hc_sample_weight home_equity_second_mortgage second_mortgage home_equity debt second_mortgage_cdf home_equity_cdf debt_cdf hs_degree hs_degree_male hs_degree_female male_age_mean male_age_median male_age_stdev male_age_sample_weight male_age_samples female_age_mean female_age_median female_age_stdev female_age_sample_weight female_age_samples pct_own married married_snp separated divorced
0 202183361.0 1699120 5230 2612 2618 769.38638 784.0 232.63967 272.34441 362.0 0.86761 0.79155 0.59155 0.45634 0.42817 0.18592 0.15493 0.12958 387 355 63125.28406 48120.0 49042.01206 1290.96240 2024.0 67994.14790 53245.0 47667.30119 884.33516 1491.0 1414.80295 1223.0 641.22898 377.83135 867.0 570.01530 558.0 270.11299 770.0 499.29293 0.01588 0.02077 0.08919 0.52963 0.43658 0.49087 0.73341 0.89288 0.85880 0.92434 42.48574 44.00000 22.97306 696.42136 2612.0 44.48629 45.33333 22.51276 685.33845 2618.0 0.79046 0.57851 0.01882 0.01240 0.08770
1 1560828.0 100363 2633 1349 1284 804.87924 848.0 253.46747 312.58622 513.0 0.97410 0.93227 0.69920 0.69920 0.55179 0.41235 0.39044 0.27888 542 502 41931.92593 35186.0 31639.50203 838.74664 1127.0 50670.10337 43023.0 34715.57548 375.28798 554.0 864.41390 784.0 482.27020 316.88320 356.0 351.98293 336.0 125.40457 229.0 189.60606 0.02222 0.02222 0.04274 0.60855 0.42174 0.70823 0.58120 0.90487 0.86947 0.94187 34.84728 32.00000 20.37452 323.90204 1349.0 36.48391 37.58333 23.43353 267.23367 1284.0 0.52483 0.34886 0.01426 0.01426 0.09030
2 69561595.0 284193 6881 3643 3238 742.77365 703.0 323.39011 291.85520 378.0 0.95238 0.88624 0.79630 0.66667 0.39153 0.39153 0.28307 0.15873 459 378 84942.68317 74964.0 56811.62186 1155.20980 2488.0 95262.51431 85395.0 49292.67664 709.74925 1889.0 1506.06758 1361.0 731.89394 699.41354 1491.0 556.45986 532.0 184.42175 538.0 323.35354 0.00000 0.00000 0.09512 0.73484 1.00000 0.46332 0.28704 0.94288 0.94616 0.93952 39.38154 40.83333 22.89769 888.29730 3643.0 42.15810 42.83333 23.94119 707.01963 3238.0 0.85331 0.64745 0.02830 0.01607 0.10657
3 1105793.0 0 2700 1141 1559 803.42018 782.0 297.39258 259.30316 368.0 0.94693 0.87151 0.69832 0.61732 0.51397 0.46927 0.35754 0.32961 438 358 48733.67116 37845.0 45100.54010 928.32193 1267.0 56401.68133 44399.0 41082.90515 490.18479 729.0 1175.28642 1101.0 428.98751 261.28471 437.0 288.04047 247.0 185.55887 392.0 314.90566 0.01086 0.01086 0.01086 0.52714 0.53057 0.82530 0.73727 0.91500 0.90755 0.92043 48.64749 48.91667 23.05968 274.98956 1141.0 47.77526 50.58333 24.32015 362.20193 1559.0 0.65037 0.47257 0.02021 0.02021 0.10106
4 2554403.0 0 5637 2586 3051 938.56493 881.0 392.44096 1005.42886 1704.0 0.99286 0.98247 0.91688 0.84740 0.78247 0.60974 0.55455 0.44416 1725 1540 31834.15466 22497.0 34046.50907 1548.67477 1983.0 54053.42396 50272.0 39609.12605 244.08903 395.0 1192.58759 1125.0 327.49674 76.61052 134.0 443.68855 444.0 76.12674 124.0 79.55556 0.05426 0.05426 0.05426 0.51938 0.18332 0.65545 0.74967 1.00000 1.00000 1.00000 26.07533 22.41667 11.84399 1296.89877 2586.0 24.17693 21.58333 11.10484 1854.48652 3051.0 0.13046 0.12356 0.00000 0.00000 0.03109
In [96]:
x_train_features = imp_feature[['pop','rent_median','hi_median','family_median','hc_mean','second_mortgage','home_equity','debt','hs_degree','pct_own','married','separated','divorced']]
In [97]:
x_train_features.head()
Out[97]:
pop rent_median hi_median family_median hc_mean second_mortgage home_equity debt hs_degree pct_own married separated divorced
0 5230 784.0 48120.0 53245.0 570.01530 0.02077 0.08919 0.52963 0.89288 0.79046 0.57851 0.01240 0.08770
1 2633 848.0 35186.0 43023.0 351.98293 0.02222 0.04274 0.60855 0.90487 0.52483 0.34886 0.01426 0.09030
2 6881 703.0 74964.0 85395.0 556.45986 0.00000 0.09512 0.73484 0.94288 0.85331 0.64745 0.01607 0.10657
3 2700 782.0 37845.0 44399.0 288.04047 0.01086 0.01086 0.52714 0.91500 0.65037 0.47257 0.02021 0.10106
4 5637 881.0 22497.0 50272.0 443.68855 0.05426 0.05426 0.51938 1.00000 0.13046 0.12356 0.00000 0.03109
In [98]:
x_train_features.shape
Out[98]:
(26585, 13)
In [99]:
y_train = imp_feature['hc_mortgage_mean']
In [100]:
x_test_feature = x_test[['pop','rent_median','hi_median','family_median','hc_mean','second_mortgage','home_equity','debt','hs_degree','pct_own','married','separated','divorced']]
In [101]:
from sklearn.linear_model import LinearRegression
le = LinearRegression()
In [102]:
le.fit(x_train_features,y_train)
Out[102]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [103]:
y_pred = le.predict(x_test_feature)
In [104]:
y_test = x_test['hc_mortgage_mean']
In [105]:
from sklearn.metrics import r2_score,mean_squared_error
In [106]:
r2_score(y_test,y_pred)
Out[106]:
0.8073813546881963
In [107]:
np.sqrt(mean_squared_error(y_test,y_pred))
Out[107]:
277.0451838858074
In [108]:
sns.distplot(y_pred)
plt.show()
In [ ]: